Library Imports¶

¶

Load up Libraries / Stopwords¶

In [1]:
# Pandas / Numpy
import pandas as pd
import numpy as np
# NLTK - 3.8.1
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Regular Expressions
import re
import string
# Date/Time
import datetime
# sklearn - 1.2.2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from sklearn.model_selection import cross_val_score, GridSearchCV
# Other Classification Models - LightGBM - 4.0.0 / XGBoost - 1.7.6 / CatBoost - 1.2
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
# News API
from newsapi import NewsApiClient
# Markdown
from IPython.display import Markdown, display
# Multi-Processing / Threading
from joblib import parallel_backend
In [2]:
# Download Stopwords
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
# Set stop words
stopWords = stopwords.words("english")
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SimonMurrell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SimonMurrell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SimonMurrell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
In [3]:
# Assign Colours
charcoalColour = "#63666A"
tealColour = "#00ACC9"
magentaColour = "#E6007E"
magentaColour = "#E6007E"
greenColour = "#80BA27"
lightGreenColour = "#B8DA25"
blueColour = "#50A2D4"
emeraldColour = "#05AF9A"
purpleColour = "#7E4CA5"
orangeColour = "#FF8300"
redColour = "#FA4616"
yellowColour = "#F3C317"
In [4]:
# News API Key - 1000 requests per 30 minutes
newsAPIKey = "d8061c628ce241bbb0a34d4b61fcd707"
In [5]:
# Assign Names
classificationName_nb = "Naive Bayes"
classificationName_lr = "Logistic Regression"
classificationName_svm = "Linear SVC"
classificationName_rf = "Random Forest"
classificationName_xg = "XG Boost"
classificationName_lg = "LightGBM"
classificationName_ab = "Adaptive Boosting"
classificationName_cb = "CatBoost"
In [ ]:
 

Load / Clean up Dataset¶

¶

Load up Datasets / Do Initial Inspection¶

In [6]:
# Load Real Datasets
real_dataset = pd.read_csv("./dataset/true.csv")
# Check Shape
real_dataset.shape
Out[6]:
(21417, 4)
In [7]:
# Load Fake Dataset
fake_dataset = pd.read_csv("./dataset/fake.csv")
# Check Shape
fake_dataset.shape
Out[7]:
(23481, 4)
In [ ]:
 
In [8]:
# Check True Dataset
real_dataset.head()
Out[8]:
title text subject date
0 As U.S. budget fight looms, Republicans flip t... WASHINGTON (Reuters) - The head of a conservat... politicsNews December 31, 2017
1 U.S. military to accept transgender recruits o... WASHINGTON (Reuters) - Transgender people will... politicsNews December 29, 2017
2 Senior U.S. Republican senator: 'Let Mr. Muell... WASHINGTON (Reuters) - The special counsel inv... politicsNews December 31, 2017
3 FBI Russia probe helped by Australian diplomat... WASHINGTON (Reuters) - Trump campaign adviser ... politicsNews December 30, 2017
4 Trump wants Postal Service to charge 'much mor... SEATTLE/WASHINGTON (Reuters) - President Donal... politicsNews December 29, 2017
In [9]:
# Check Fake Dataset
fake_dataset.head()
Out[9]:
title text subject date
0 Donald Trump Sends Out Embarrassing New Year’... Donald Trump just couldn t wish all Americans ... News December 31, 2017
1 Drunk Bragging Trump Staffer Started Russian ... House Intelligence Committee Chairman Devin Nu... News December 31, 2017
2 Sheriff David Clarke Becomes An Internet Joke... On Friday, it was revealed that former Milwauk... News December 30, 2017
3 Trump Is So Obsessed He Even Has Obama’s Name... On Christmas day, Donald Trump announced that ... News December 29, 2017
4 Pope Francis Just Called Out Donald Trump Dur... Pope Francis used his annual Christmas Day mes... News December 25, 2017
In [ ]:
 
In [10]:
# Add Column
real_dataset["label"] = 1
fake_dataset["label"] = 0
In [11]:
# Drop Columns
real_dataset = real_dataset.drop("subject", axis=1)
real_dataset = real_dataset.drop("date", axis=1)
fake_dataset = fake_dataset.drop("subject", axis=1)
fake_dataset = fake_dataset.drop("date", axis=1)
In [ ]:
 
In [12]:
# Check Shape
real_dataset.shape
Out[12]:
(21417, 3)
In [13]:
# Check True Dataset
real_dataset.head()
Out[13]:
title text label
0 As U.S. budget fight looms, Republicans flip t... WASHINGTON (Reuters) - The head of a conservat... 1
1 U.S. military to accept transgender recruits o... WASHINGTON (Reuters) - Transgender people will... 1
2 Senior U.S. Republican senator: 'Let Mr. Muell... WASHINGTON (Reuters) - The special counsel inv... 1
3 FBI Russia probe helped by Australian diplomat... WASHINGTON (Reuters) - Trump campaign adviser ... 1
4 Trump wants Postal Service to charge 'much mor... SEATTLE/WASHINGTON (Reuters) - President Donal... 1
In [14]:
# Check Shape
fake_dataset.shape
Out[14]:
(23481, 3)
In [15]:
# Check Fake Dataset
fake_dataset.head()
Out[15]:
title text label
0 Donald Trump Sends Out Embarrassing New Year’... Donald Trump just couldn t wish all Americans ... 0
1 Drunk Bragging Trump Staffer Started Russian ... House Intelligence Committee Chairman Devin Nu... 0
2 Sheriff David Clarke Becomes An Internet Joke... On Friday, it was revealed that former Milwauk... 0
3 Trump Is So Obsessed He Even Has Obama’s Name... On Christmas day, Donald Trump announced that ... 0
4 Pope Francis Just Called Out Donald Trump Dur... Pope Francis used his annual Christmas Day mes... 0
In [ ]:
 

Combine Datasets¶

In [ ]:
 

Strip out Prefix in Real News¶

In [16]:
def extractTextAfterHyphen(text):
    
    # Check Value
    parts = text.split("-")
    # Check Value
    if len(parts) > 1:
        
        # Return
        return parts[1].strip()
    
    else:
        
        # Return
        return text
    
In [17]:
# Strip out Prefix
real_dataset["text"] = real_dataset["text"].apply(extractTextAfterHyphen)
In [18]:
# Check Real Dataset
real_dataset.head()
Out[18]:
title text label
0 As U.S. budget fight looms, Republicans flip t... The head of a conservative Republican faction ... 1
1 U.S. military to accept transgender recruits o... Transgender people will be allowed for the fir... 1
2 Senior U.S. Republican senator: 'Let Mr. Muell... The special counsel investigation of links bet... 1
3 FBI Russia probe helped by Australian diplomat... Trump campaign adviser George Papadopoulos tol... 1
4 Trump wants Postal Service to charge 'much mor... President Donald Trump called on the U.S. Post... 1
In [ ]:
 
In [19]:
# Combine Datasets
news_dataset = pd.concat([real_dataset, fake_dataset])
In [20]:
# Check Shape
news_dataset.shape
Out[20]:
(44898, 3)
In [21]:
# Check Fake Dataset
news_dataset.head()
Out[21]:
title text label
0 As U.S. budget fight looms, Republicans flip t... The head of a conservative Republican faction ... 1
1 U.S. military to accept transgender recruits o... Transgender people will be allowed for the fir... 1
2 Senior U.S. Republican senator: 'Let Mr. Muell... The special counsel investigation of links bet... 1
3 FBI Russia probe helped by Australian diplomat... Trump campaign adviser George Papadopoulos tol... 1
4 Trump wants Postal Service to charge 'much mor... President Donald Trump called on the U.S. Post... 1
In [ ]:
 

Info / Describe Dataset¶

In [22]:
# Check Info
news_dataset.info()
<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 23480
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   44898 non-null  object
 1   text    44898 non-null  object
 2   label   44898 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.4+ MB
In [23]:
# Describe
news_dataset.describe()
Out[23]:
label
count 44898.000000
mean 0.477015
std 0.499477
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 1.000000
In [ ]:
 

Real vs Fake News Breakdown¶

In [24]:
# Get Label Value Counts
labelCounts = news_dataset["label"].value_counts()
# Get Label Mappings
labelMappings = {0: "Fake News", 1: "Real News"}
# Setup Colours
colours = [charcoalColour, tealColour]
# Create Pie Chart
plt.pie(labelCounts, labels=labelCounts.index.map(labelMappings), autopct="%1.1f%%", colors=colours)
# Set Legend
plt.legend(title="News Type", labels=["Fake News", "Real News"], loc="center left", bbox_to_anchor=(1, 0.5))
# Set Title
plt.title("Real vs Fake News")
# Show
plt.show()
In [ ]:
 

Pre-processing Data¶

¶

Do Initial Inspection¶

In [25]:
# Check for missing values
missing_values = news_dataset.isnull().sum()
# Print 
missing_values
Out[25]:
title    0
text     0
label    0
dtype: int64
In [ ]:
 

Remove Invalid Data¶

In [26]:
# Check if there is any unrealiable data

# Print Missing Values
print(f'Missing Values')
print(f'Title: {missing_values.title}')
print(f'Text: {missing_values.text}')
print(f'Label: {missing_values.label}')
# Drop NA
news_dataset = news_dataset.dropna()
# Get Some Stats
news_dataset.describe()
Missing Values
Title: 0
Text: 0
Label: 0
Out[26]:
label
count 44898.000000
mean 0.477015
std 0.499477
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 1.000000
In [27]:
# Print Real News Head
news_dataset.head()
Out[27]:
title text label
0 As U.S. budget fight looms, Republicans flip t... The head of a conservative Republican faction ... 1
1 U.S. military to accept transgender recruits o... Transgender people will be allowed for the fir... 1
2 Senior U.S. Republican senator: 'Let Mr. Muell... The special counsel investigation of links bet... 1
3 FBI Russia probe helped by Australian diplomat... Trump campaign adviser George Papadopoulos tol... 1
4 Trump wants Postal Service to charge 'much mor... President Donald Trump called on the U.S. Post... 1
In [28]:
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
Out[28]:
title text label
0 Donald Trump Sends Out Embarrassing New Year’... Donald Trump just couldn t wish all Americans ... 0
1 Drunk Bragging Trump Staffer Started Russian ... House Intelligence Committee Chairman Devin Nu... 0
2 Sheriff David Clarke Becomes An Internet Joke... On Friday, it was revealed that former Milwauk... 0
3 Trump Is So Obsessed He Even Has Obama’s Name... On Christmas day, Donald Trump announced that ... 0
4 Pope Francis Just Called Out Donald Trump Dur... Pope Francis used his annual Christmas Day mes... 0
In [ ]:
 

Lowercase Characters¶

In [29]:
# Lowercase words
news_dataset["title"] = news_dataset["title"].str.lower()
news_dataset["text"] = news_dataset["text"].str.lower()
In [ ]:
 
In [30]:
# Print Real News Head
news_dataset.head()
Out[30]:
title text label
0 as u.s. budget fight looms, republicans flip t... the head of a conservative republican faction ... 1
1 u.s. military to accept transgender recruits o... transgender people will be allowed for the fir... 1
2 senior u.s. republican senator: 'let mr. muell... the special counsel investigation of links bet... 1
3 fbi russia probe helped by australian diplomat... trump campaign adviser george papadopoulos tol... 1
4 trump wants postal service to charge 'much mor... president donald trump called on the u.s. post... 1
In [31]:
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
Out[31]:
title text label
0 donald trump sends out embarrassing new year’... donald trump just couldn t wish all americans ... 0
1 drunk bragging trump staffer started russian ... house intelligence committee chairman devin nu... 0
2 sheriff david clarke becomes an internet joke... on friday, it was revealed that former milwauk... 0
3 trump is so obsessed he even has obama’s name... on christmas day, donald trump announced that ... 0
4 pope francis just called out donald trump dur... pope francis used his annual christmas day mes... 0
In [ ]:
 

Remove URLs¶

In [32]:
def removeUrls(text):
    
    # Setup Pattern
    pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    # Sub
    retVal = re.sub(pattern, "", text)
    # Return
    return retVal

# Remove URLs
news_dataset["title"] = news_dataset["title"].apply(removeUrls)
news_dataset["text"] = news_dataset["text"].apply(removeUrls)
In [33]:
# Print Real News Head
news_dataset.head()
Out[33]:
title text label
0 as u.s. budget fight looms, republicans flip t... the head of a conservative republican faction ... 1
1 u.s. military to accept transgender recruits o... transgender people will be allowed for the fir... 1
2 senior u.s. republican senator: 'let mr. muell... the special counsel investigation of links bet... 1
3 fbi russia probe helped by australian diplomat... trump campaign adviser george papadopoulos tol... 1
4 trump wants postal service to charge 'much mor... president donald trump called on the u.s. post... 1
In [34]:
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
Out[34]:
title text label
0 donald trump sends out embarrassing new year’... donald trump just couldn t wish all americans ... 0
1 drunk bragging trump staffer started russian ... house intelligence committee chairman devin nu... 0
2 sheriff david clarke becomes an internet joke... on friday, it was revealed that former milwauk... 0
3 trump is so obsessed he even has obama’s name... on christmas day, donald trump announced that ... 0
4 pope francis just called out donald trump dur... pope francis used his annual christmas day mes... 0
In [ ]:
 

Remove Punctuation¶

In [35]:
# Remove Punctuation
removepunctuation = str.maketrans("", "", string.punctuation)

news_dataset["title"] = news_dataset["title"].str.translate(removepunctuation)
news_dataset["text"] = news_dataset["text"].str.translate(removepunctuation)
In [ ]:
 
In [36]:
# Print Real News Head
news_dataset.head()
Out[36]:
title text label
0 as us budget fight looms republicans flip thei... the head of a conservative republican faction ... 1
1 us military to accept transgender recruits on ... transgender people will be allowed for the fir... 1
2 senior us republican senator let mr mueller do... the special counsel investigation of links bet... 1
3 fbi russia probe helped by australian diplomat... trump campaign adviser george papadopoulos tol... 1
4 trump wants postal service to charge much more... president donald trump called on the us postal... 1
In [37]:
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
Out[37]:
title text label
0 donald trump sends out embarrassing new year’... donald trump just couldn t wish all americans ... 0
1 drunk bragging trump staffer started russian ... house intelligence committee chairman devin nu... 0
2 sheriff david clarke becomes an internet joke... on friday it was revealed that former milwauke... 0
3 trump is so obsessed he even has obama’s name... on christmas day donald trump announced that h... 0
4 pope francis just called out donald trump dur... pope francis used his annual christmas day mes... 0
In [ ]:
 

Remove Numbers¶

In [38]:
# Remove Numbers
news_dataset["title"] = news_dataset["title"].str.replace(r"\d+", "", regex=True)
news_dataset["text"] = news_dataset["text"].str.replace(r"\d+", "", regex=True)
In [ ]:
 
In [39]:
# Print Real News Head
news_dataset.head()
Out[39]:
title text label
0 as us budget fight looms republicans flip thei... the head of a conservative republican faction ... 1
1 us military to accept transgender recruits on ... transgender people will be allowed for the fir... 1
2 senior us republican senator let mr mueller do... the special counsel investigation of links bet... 1
3 fbi russia probe helped by australian diplomat... trump campaign adviser george papadopoulos tol... 1
4 trump wants postal service to charge much more... president donald trump called on the us postal... 1
In [40]:
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
Out[40]:
title text label
0 donald trump sends out embarrassing new year’... donald trump just couldn t wish all americans ... 0
1 drunk bragging trump staffer started russian ... house intelligence committee chairman devin nu... 0
2 sheriff david clarke becomes an internet joke... on friday it was revealed that former milwauke... 0
3 trump is so obsessed he even has obama’s name... on christmas day donald trump announced that h... 0
4 pope francis just called out donald trump dur... pope francis used his annual christmas day mes... 0
In [ ]:
 

Remove Unicode¶

In [41]:
def removeUnicode(text):
    
    # Return
    return text.encode("ascii", "ignore").decode("ascii")

# Remove Unicode
news_dataset["title"] = news_dataset["title"].apply(removeUnicode)
news_dataset["text"] = news_dataset["text"].apply(removeUnicode)
In [ ]:
 
In [42]:
# Print Real News Head
news_dataset.head()
Out[42]:
title text label
0 as us budget fight looms republicans flip thei... the head of a conservative republican faction ... 1
1 us military to accept transgender recruits on ... transgender people will be allowed for the fir... 1
2 senior us republican senator let mr mueller do... the special counsel investigation of links bet... 1
3 fbi russia probe helped by australian diplomat... trump campaign adviser george papadopoulos tol... 1
4 trump wants postal service to charge much more... president donald trump called on the us postal... 1
In [43]:
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
Out[43]:
title text label
0 donald trump sends out embarrassing new years... donald trump just couldn t wish all americans ... 0
1 drunk bragging trump staffer started russian ... house intelligence committee chairman devin nu... 0
2 sheriff david clarke becomes an internet joke... on friday it was revealed that former milwauke... 0
3 trump is so obsessed he even has obamas name ... on christmas day donald trump announced that h... 0
4 pope francis just called out donald trump dur... pope francis used his annual christmas day mes... 0
In [ ]:
 

Lemmatize Words¶

In [44]:
# Initialize WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()
In [45]:
def lemmatizeText(text):
    
    # Extract Words from Text
    words = word_tokenize(text)
    # Lemmatize Words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    # Rejoin Words
    retVal = " ".join(lemmatized_words) 
    # Return
    return retVal

# Lemmatize Words
news_dataset["title"] = news_dataset["title"].apply(lemmatizeText)
news_dataset["text"] = news_dataset["text"].apply(lemmatizeText)
In [ ]:
 
In [46]:
# Print Real News Head
news_dataset.head()
Out[46]:
title text label
0 a u budget fight loom republican flip their fi... the head of a conservative republican faction ... 1
1 u military to accept transgender recruit on mo... transgender people will be allowed for the fir... 1
2 senior u republican senator let mr mueller do ... the special counsel investigation of link betw... 1
3 fbi russia probe helped by australian diplomat... trump campaign adviser george papadopoulos tol... 1
4 trump want postal service to charge much more ... president donald trump called on the u postal ... 1
In [47]:
# Print Fake News Head
news_dataset[news_dataset["label"] == 0].head()
Out[47]:
title text label
0 donald trump sends out embarrassing new year e... donald trump just couldn t wish all american a... 0
1 drunk bragging trump staffer started russian c... house intelligence committee chairman devin nu... 0
2 sheriff david clarke becomes an internet joke ... on friday it wa revealed that former milwaukee... 0
3 trump is so obsessed he even ha obamas name co... on christmas day donald trump announced that h... 0
4 pope francis just called out donald trump duri... pope francis used his annual christmas day mes... 0
In [ ]:
 
In [48]:
# Current techniques used in full project:

# Remove unreliable data
# Lowercase words
# Remove URLs
# Remove punctuation
# Remove Numbers
# Remove Unicode Characters
# Lemmatization / Stemming (Not Applied)
# Stop word removal - Not used as it reduced performance
In [ ]:
 

Title / Text / Word Analysis¶

¶

Build Datasets for Analysis¶

In [49]:
# Setup Count Vectorizer
titleRealWordCounter = CountVectorizer()
# Fit / Transform Title
titleRealBow = titleRealWordCounter.fit_transform(news_dataset.loc[news_dataset["label"] == 1, "title"])
# Extract Feature Names
titleRealFeatureNames = titleRealWordCounter.get_feature_names_out()
# Get Word Counts
titleRealWordCount = titleRealBow.sum(axis=0)
# Setup Dictionary
titleRealWordCountDict = dict(zip(titleRealFeatureNames, titleRealWordCount.tolist()[0]))
# Sort Word Count Dictionary
titleRealSortedWordCountDict = sorted(titleRealWordCountDict.items(), key=lambda x: x[1], reverse=True)
In [50]:
# Generate DataFrame
dfTitleRealWords = pd.DataFrame(titleRealSortedWordCountDict)
# Assign Values
dfTitleRealWords = dfTitleRealWords.rename(columns={0: "word"})
dfTitleRealWords = dfTitleRealWords.rename(columns={1: "count"})
dfTitleRealWords["type"] = "title"
dfTitleRealWords["label"] = 1
In [51]:
# Setup Count Vectorizer
titleFakeWordCounter = CountVectorizer()
# Fit / Transform Title
titleFakeBow = titleFakeWordCounter.fit_transform(news_dataset.loc[news_dataset["label"] == 0, "title"])
# Extract Feature Names
titleFakeFeatureNames = titleFakeWordCounter.get_feature_names_out()
# Get Word Counts
titleFakeWordCount = titleFakeBow.sum(axis=0)
# Setup Dictionary
titleFakeWordCountDict = dict(zip(titleFakeFeatureNames, titleFakeWordCount.tolist()[0]))
# Sort Word Count Dictionary
titleFakeSortedWordCountDict = sorted(titleFakeWordCountDict.items(), key=lambda x: x[1], reverse=True)
In [52]:
# Generate DataFrame
dfTitleFakeWords = pd.DataFrame(titleFakeSortedWordCountDict)
# Assign Values
dfTitleFakeWords = dfTitleFakeWords.rename(columns={0: "word"})
dfTitleFakeWords = dfTitleFakeWords.rename(columns={1: "count"})
dfTitleFakeWords["type"] = "title"
dfTitleFakeWords["label"] = 0
In [ ]:
 
In [53]:
# Setup Count Vectorizer
textRealWordCounter = CountVectorizer()
# Fit / Transform Text
textRealBow = textRealWordCounter.fit_transform(news_dataset.loc[news_dataset["label"] == 1, "text"])
# Extract Feature Names
textRealFeatureNames = textRealWordCounter.get_feature_names_out()
# Get Word Counts
textRealWordCount = textRealBow.sum(axis=0)
# Setup Dictionary
textRealWordCountDict = dict(zip(textRealFeatureNames, textRealWordCount.tolist()[0]))
# Sort Word Count Dictionary
textRealSortedWordCountDict = sorted(textRealWordCountDict.items(), key=lambda x: x[1], reverse=True)
In [54]:
# Generate DataFrame
dfTextRealWords = pd.DataFrame(textRealSortedWordCountDict)
# Assign Values
dfTextRealWords = dfTextRealWords.rename(columns={0: "word"})
dfTextRealWords = dfTextRealWords.rename(columns={1: "count"})
dfTextRealWords["word_length"] = dfTextRealWords["word"].apply(lambda x: len(x))
dfTextRealWords["type"] = "text"
dfTextRealWords["label"] = 1
In [55]:
# Setup Count Vectorizer
textFakeWordCounter = CountVectorizer()
# Fit / Transform Text
textFakeBow = textFakeWordCounter.fit_transform(news_dataset.loc[news_dataset["label"] == 0, "text"])
# Extract Feature Names
textFakeFeatureNames = textFakeWordCounter.get_feature_names_out()
# Get Word Counts
textFakeWordCount = textFakeBow.sum(axis=0)
# Setup Dictionary
textFakeWordCountDict = dict(zip(textFakeFeatureNames, textFakeWordCount.tolist()[0]))
# Sort Word Count Dictionary
textFakeSortedWordCountDict = sorted(textFakeWordCountDict.items(), key=lambda x: x[1], reverse=True)
In [56]:
# Generate DataFrame
dfTextFakeWords = pd.DataFrame(textFakeSortedWordCountDict)
# Assign Values
dfTextFakeWords = dfTextFakeWords.rename(columns={0: "word"})
dfTextFakeWords = dfTextFakeWords.rename(columns={1: "count"})
dfTextFakeWords["type"] = "text"
dfTextFakeWords["label"] = 0
In [ ]:
 
In [57]:
# Get Title Words
dfTitleWords = pd.merge(dfTitleRealWords, dfTitleFakeWords, on="word", how="outer")
# Drop Columns
dfTitleWords = dfTitleWords.drop("type_x", axis=1)
dfTitleWords = dfTitleWords.drop("label_x", axis=1)
dfTitleWords = dfTitleWords.drop("type_y", axis=1)
dfTitleWords = dfTitleWords.drop("label_y", axis=1)
# Rename Columns
dfTitleWords = dfTitleWords.rename(columns={"count_x": "real_count"})
dfTitleWords = dfTitleWords.rename(columns={"count_y": "fake_count"})
# Remove stop words
dfTitleWords["word"] = dfTitleWords["word"].apply(lambda x: " ".join([word for word in x.split() if word.lower() not in stopWords]))
dfTitleWords = dfTitleWords[dfTitleWords["word"] != ""]
# Drop NA
dfTitleWords = dfTitleWords.dropna()
# Set Types
dfTitleWords["real_count"] = dfTitleWords["real_count"].astype(int)
dfTitleWords["fake_count"] = dfTitleWords["fake_count"].astype(int)
# Add Column
dfTitleWords["total_count"] = dfTitleWords["real_count"] + dfTitleWords["fake_count"]
# Sort
dfTitleWords = dfTitleWords.sort_values(by="total_count", ascending=False)

# Get Text Words
dfTextWords = pd.merge(dfTextRealWords, dfTextFakeWords, on="word", how="outer")
# Drop Columns
dfTextWords = dfTextWords.drop("type_x", axis=1)
dfTextWords = dfTextWords.drop("label_x", axis=1)
dfTextWords = dfTextWords.drop("type_y", axis=1)
dfTextWords = dfTextWords.drop("label_y", axis=1)
# Rename Columns
dfTextWords = dfTextWords.rename(columns={"count_x": "real_count"})
dfTextWords = dfTextWords.rename(columns={"count_y": "fake_count"})
# Remove stop words
dfTextWords["word"] = dfTextWords["word"].apply(lambda x: " ".join([word for word in x.split() if word.lower() not in stopWords]))
dfTextWords = dfTextWords[dfTextWords["word"] != ""]
# Drop NA
dfTextWords = dfTextWords.dropna()
# Set Types
dfTextWords["real_count"] = dfTextWords["real_count"].astype(int)
dfTextWords["fake_count"] = dfTextWords["fake_count"].astype(int)
# Add Column
dfTextWords["total_count"] = dfTextWords["real_count"] + dfTextWords["fake_count"]
# Sort
dfTextWords = dfTextWords.sort_values(by="total_count", ascending=False)
In [ ]:
 
In [58]:
# Get Real Titles
dfRealTitles = news_dataset[news_dataset["label"] == 1]
# Drop Columns
dfRealTitles = dfRealTitles.drop("text", axis=1)
dfRealTitles = dfRealTitles.drop("label", axis=1)
# Add Column
dfRealTitles["title_length"] = dfRealTitles["title"].apply(lambda x: len(x))
# Remove Rubbish
dfRealTitles = dfRealTitles[dfRealTitles["title"] != ""]
# Sort
dfRealTitles = dfRealTitles.sort_values(by="title_length", ascending=True)

# Get Fake Titles
dfFakeTitles = news_dataset[news_dataset["label"] == 0]
# Drop Columns
dfFakeTitles = dfFakeTitles.drop("text", axis=1)
dfFakeTitles = dfFakeTitles.drop("label", axis=1)
# Add Column
dfFakeTitles["title_length"] = dfFakeTitles["title"].apply(lambda x: len(x))
# Remove Rubbish
dfFakeTitles = dfFakeTitles[dfFakeTitles["title"] != ""]
# Sort
dfFakeTitles = dfFakeTitles.sort_values(by="title_length", ascending=True)

# Get Real Title Length Average / Size
dfRealTitleLengths = dfRealTitles.groupby("title_length").size().reset_index(name="count")
# Get Real Title Length Average / Size
dfFakeTitleLengths = dfFakeTitles.groupby("title_length").size().reset_index(name="count")
In [ ]:
 

Article Title Lengths - Real vs Fake¶

In [59]:
# Set Lines
plt.plot(dfRealTitleLengths["title_length"], dfRealTitleLengths["count"], label="Real", c=charcoalColour)
plt.plot(dfFakeTitleLengths["title_length"], dfFakeTitleLengths["count"], label="Fake", c=tealColour)
# Set Title
plt.title("Article Title Lengths - Real vs Fake")
plt.xlabel("Title Lengths")
plt.ylabel("Count")
# Set Legend
plt.legend()
# Show
plt.show()
In [ ]:
 
In [60]:
# Get Real Text
dfRealText = news_dataset[news_dataset["label"] == 1]
# Drop Columns
dfRealText = dfRealText.drop("title", axis=1)
dfRealText = dfRealText.drop("label", axis=1)
# Add Column
dfRealText["text_length"] = dfRealText["text"].apply(lambda x: len(x))
# Remove Rubbish
dfRealText = dfRealText[dfRealText["text_length"] > 250]
# Sort
dfRealText = dfRealText.sort_values(by="text_length", ascending=True)

# Get Fake Text
dfFakeText = news_dataset[news_dataset["label"] == 0]
# Drop Columns
dfFakeText = dfFakeText.drop("title", axis=1)
dfFakeText = dfFakeText.drop("label", axis=1)
# Add Column
dfFakeText["text_length"] = dfFakeText["text"].apply(lambda x: len(x))
# Remove Rubbish
dfFakeText = dfFakeText[dfFakeText["text_length"] > 250]
# Sort
dfFakeText = dfFakeText.sort_values(by="text_length", ascending=True)

# Get Real Text Length Average / Size
dfRealTextLengths = dfRealText.groupby("text_length").size().reset_index(name="count")
# Get Real Text Length Average / Size
dfFakeTextLengths = dfFakeText.groupby("text_length").size().reset_index(name="count")
In [ ]:
 

Article Text Lengths - Real vs Fake¶

In [61]:
# Set Scatter
plt.scatter(dfRealTextLengths["text_length"], dfRealTextLengths["count"], label="Real", c=charcoalColour)
plt.scatter(dfFakeTextLengths["text_length"], dfFakeTextLengths["count"], label="Fake", c=tealColour)
# Set Title
plt.title("Article Text Lengths - Real vs Fake")
plt.xlabel("Text Lengths")
plt.ylabel("Count")
# Set Legend
plt.legend()
# Show
plt.show()
In [ ]:
 
In [62]:
# Get Real Titles
dfRealTitles = dfTitleRealWords
# Drop Columns
dfRealTitles = dfRealTitles.drop("count", axis=1)
dfRealTitles = dfRealTitles.drop("type", axis=1)
dfRealTitles = dfRealTitles.drop("label", axis=1)
# Add Column
dfRealTitles["word_length"] = dfRealTitles["word"].apply(lambda x: len(x))
# Sort
dfRealTitles = dfRealTitles.sort_values(by="word_length", ascending=True)

# Get Fake Titles
dfFakeTitles = dfTitleFakeWords
# Drop Columns
dfFakeTitles = dfFakeTitles.drop("count", axis=1)
dfFakeTitles = dfFakeTitles.drop("type", axis=1)
dfFakeTitles = dfFakeTitles.drop("label", axis=1)
# Add Column
dfFakeTitles["word_length"] = dfFakeTitles["word"].apply(lambda x: len(x))
# Sort
dfFakeTitles = dfFakeTitles.sort_values(by="word_length", ascending=True)

# Get Real Word Length Average / Size
dfRealWordLengths = dfRealTitles.groupby("word_length").size().reset_index(name="count")
# Get Real Word Length Average / Size
dfFakeWordLengths = dfFakeTitles.groupby("word_length").size().reset_index(name="count")
In [ ]:
 

Word Lengths - Real vs Fake¶

In [63]:
# Set Lines
plt.plot(dfRealWordLengths["word_length"], dfRealWordLengths["count"], label="Real", c=charcoalColour)
plt.plot(dfFakeWordLengths["word_length"], dfFakeWordLengths["count"], label="Fake", c=tealColour)
# Set Title
plt.title("Word Lengths - Real vs Fake")
plt.xlabel("Word Lengths")
plt.ylabel("Count")
# Set Legend
plt.legend()
# Show
plt.show()
In [ ]:
 

Top 25 Real vs Fake Words in Article Title¶

In [64]:
# Assign Values
words = dfTitleWords["word"][:25]
real_counts = dfTitleWords["real_count"][:25]
fake_counts = dfTitleWords["fake_count"][:25]
# Set up horizontal bars
plt.barh(words, real_counts, label="Real Words", color=charcoalColour)
plt.barh(words, fake_counts, left=real_counts, label="Fake Words", color=tealColour)
# Set Titles
plt.xlabel("Counts")
plt.ylabel("Words")
plt.title("Top 25 Real vs Fake Words in Article Title")
# Set Legend
plt.legend()
# Show
plt.show()
In [ ]:
 

Top 25 Real vs Fake Words in Article Text¶

In [65]:
# Assign Values
words = dfTextWords["word"][:25]
real_counts = dfTextWords["real_count"][:25]
fake_counts = dfTextWords["fake_count"][:25]
# Set up horizontal bars
plt.barh(words, real_counts, label="Real Words", color=charcoalColour)
plt.barh(words, fake_counts, left=real_counts, label="Fake Words", color=tealColour)
# Set Titles
plt.xlabel("Counts")
plt.ylabel("Words")
plt.title("Top 25 Real vs Fake Words in Article Text")
# Set Legend
plt.legend()
# Show
plt.show()
In [ ]:
 

Split up Train / Testing Data¶

¶

In [66]:
# Setup Label Encoder
labelEncoder = LabelEncoder()
# Train Dataset
X = news_dataset["title"] + " " + news_dataset["text"]
y = news_dataset["label"]
# Fit / Transform
y = labelEncoder.fit_transform(y)
# Train / Test Split based on 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Setup Labels
labels = ["Fake", "Real"]
In [ ]:
 

Build Classification Pipelines¶

¶

In [ ]:
 
In [67]:
def buildPipeline(classifier, parameters_grid):
    
    # Print Status
    #print(f"Building Pipeline - " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    
    # Setup TfIdfVectorizer
    tfidfVectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1, 2), min_df=2, max_df=0.6, sublinear_tf=True)
    # Setup Pipeline
    model = Pipeline(steps=[
        ("tfidfvectorizer", tfidfVectorizer),
        ("model", classifier)
    ])
    # Fit Model
    model.fit(X_train, y_train)
    
    # Check Value
    if (parameters_grid is not None):
    
        # Run across multi-processors
        with parallel_backend("multiprocessing"):

            # Setup Grid Search CV
            grid_search = GridSearchCV(model, parameters_grid, n_jobs=-1)
            # Fit Model
            grid_search.fit(X_train, y_train)

        # Re-assign Modal
        model = grid_search.best_estimator_

    # Predict
    y_pred = model.predict(X_test)
    
    # Print Status
    #print(f"Built Pipeline - " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    
    # Return
    return tfidfVectorizer, classifier, model, y_pred
In [68]:
def buildConfusionMatrix(y_test, y_pred):
    
    # Setup Confusion Matrix
    confusionMatrix = confusion_matrix(y_test, y_pred)
    # Switch
    confusionMatrix = [confusionMatrix[1], confusionMatrix[0]]
    # Setup Colours
    colours = [charcoalColour, tealColour]
    # Setup Heatmap
    figure_confusionMatrix = go.Figure(data=go.Heatmap(z=confusionMatrix,
                                      x=labels,
                                      y=labels[::-1],
                                      colorscale=colours,
                                      hoverinfo="z",
                                      showscale=False,
                                      text=confusionMatrix))
    # Build Classification Report
    classificationReport = classification_report(y_test, y_pred, output_dict=True)
    # Assign Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print Output Metrics
    print(classification_report(y_test, y_pred, target_names=labels))
    # Print Accuracy
    print("Accuracy:", accuracy)
    
    # Return
    return confusionMatrix, figure_confusionMatrix, classificationReport, accuracy
In [69]:
def appendMetrics(name, classificationReport, metrics):
    
    # Check Value
    if (name == "Naive Bayes"):
    
        # Setup Results
        metrics = pd.DataFrame({
            "Name": name,
            "Accuracy": str(round(classificationReport["accuracy"] * 100, 4)),
            "Precision": str(round(classificationReport["macro avg"]["precision"] * 100, 4)),
            "Recall": str(round(classificationReport["macro avg"]["recall"] * 100, 4)),
            "F1-Score": str(round(classificationReport["macro avg"]["f1-score"] * 100, 4))
            }, index=[0])
        
    else:
        
        # Setup Results
        newRow = pd.Series({
            "Name": name,
            "Accuracy": str(round(classificationReport["accuracy"] * 100, 4)),
            "Precision": str(round(classificationReport["macro avg"]["precision"] * 100, 4)),
            "Recall": str(round(classificationReport["macro avg"]["recall"] * 100, 4)),
            "F1-Score": str(round(classificationReport["macro avg"]["f1-score"] * 100, 4))
            })
        # Append Results
        metrics = pd.concat([metrics, newRow.to_frame().T], ignore_index=True)
        
    # Return
    return metrics
In [ ]:
 

Naive Bayes Classification Model¶

In [70]:
# Build Naive Bayes Pipeline
tfidfVectorizer_nb, classifier_nb, model_nb, y_pred_nb = buildPipeline(MultinomialNB(alpha=0.1, fit_prior=False), None)
# Build Confusion Matrix Heat Map Figure
confusionMatrix_nb, fig_cm_nb, classification_nb, accuracy_nb = buildConfusionMatrix(y_test, y_pred_nb)
# Append Metrics
metrics = appendMetrics(classificationName_nb, classification_nb, None)
              precision    recall  f1-score   support

        Fake       0.96      0.97      0.96      4686
        Real       0.97      0.95      0.96      4294

    accuracy                           0.96      8980
   macro avg       0.96      0.96      0.96      8980
weighted avg       0.96      0.96      0.96      8980

Accuracy: 0.9628062360801781
In [ ]:
 

Logistic Regression Classification Model¶

In [71]:
# Build Logistic Regession Pipeline
tfidfVectorizer_lr, classifier_lr, model_lr, y_pred_lr = buildPipeline(LogisticRegression(C=50, n_jobs=-1, solver="liblinear"), None)
# Build Confusion Matrix Heat Map Figure
confusionMatrix_lr, fig_cm_lr, classification_lr, accuracy_lr = buildConfusionMatrix(y_test, y_pred_lr)
# Append Metrics
metrics = appendMetrics(classificationName_lr, classification_lr, metrics)
C:\Users\SimonMurrell\anaconda3\envs\nlp\Lib\site-packages\sklearn\linear_model\_logistic.py:1222: UserWarning:

'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 20.

              precision    recall  f1-score   support

        Fake       0.99      1.00      1.00      4686
        Real       1.00      0.99      0.99      4294

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Accuracy: 0.9948775055679288
In [ ]:
 

Linear SVM Classification Model¶

In [72]:
# Build Linear SVM Pipeline
tfidfVectorizer_svm, classifier_svm, model_svm, y_pred_svm = buildPipeline(SVC(C=2, kernel="linear"), None)
# Build Confusion Matrix Heat Map Figure
confusionMatrix_svm, fig_cm_svm, classification_svm, accuracy_svm = buildConfusionMatrix(y_test, y_pred_svm)
# Append Metrics
metrics = appendMetrics(classificationName_svm, classification_svm, metrics)
              precision    recall  f1-score   support

        Fake       1.00      0.99      0.99      4686
        Real       0.99      1.00      0.99      4294

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Accuracy: 0.9944320712694877
In [ ]:
 

Random Forest Classification Model¶

In [73]:
# Build Random Forest Pipeline
tfidfVectorizer_rf, classifier_rf, model_rf, y_pred_rf = buildPipeline(RandomForestClassifier(n_estimators=150, max_depth=500, n_jobs=-1), None)
# Build Confusion Matrix Heat Map Figumre
confusionMatrix_rf, fig_cm_rf, classification_rf, accuracy_rf = buildConfusionMatrix(y_test, y_pred_rf)
# Append Metrics
metrics = appendMetrics(classificationName_rf, classification_rf, metrics)
              precision    recall  f1-score   support

        Fake       0.97      0.99      0.98      4686
        Real       0.99      0.97      0.98      4294

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980

Accuracy: 0.9791759465478842
In [ ]:
 

XG Boost Classification Model¶

In [74]:
# Build XG Boost Pipeline
tfidfVectorizer_xg, classifier_xg, model_xg, y_pred_xg = buildPipeline(XGBClassifier(max_depth=7, n_estimators=250), None)
# Build Confusion Matrix Heat Map Figure
confusionMatrix_xg, fig_cm_xg, classification_xg, accuracy_xg = buildConfusionMatrix(y_test, y_pred_xg)
# Append Metrics
metrics = appendMetrics(classificationName_xg, classification_xg, metrics)
              precision    recall  f1-score   support

        Fake       0.99      0.99      0.99      4686
        Real       0.99      0.99      0.99      4294

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Accuracy: 0.9909799554565701
In [ ]:
 

LightGBM Classification Model¶

In [75]:
# Build LightGBM Pipeline
tfidfVectorizer_lg, classifier_lg, model_lg, y_pred_lg = buildPipeline(LGBMClassifier(boosting_type="gbdt", learning_rate=0.2, n_estimators=200, num_leaves=20), None)
# Build Confusion Matrix Heat Map Figure
confusionMatrix_lg, fig_cm_lg, classification_lg, accuracy_lg = buildConfusionMatrix(y_test, y_pred_lg)
# Append Metrics
metrics = appendMetrics(classificationName_lg, classification_lg, metrics)
[LightGBM] [Info] Number of positive: 17123, number of negative: 18795
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 2.682172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1421730
[LightGBM] [Info] Number of data points in the train set: 35918, number of used features: 14996
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.476725 -> initscore=-0.093168
[LightGBM] [Info] Start training from score -0.093168
              precision    recall  f1-score   support

        Fake       0.99      0.99      0.99      4686
        Real       0.99      0.99      0.99      4294

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Accuracy: 0.9926503340757238
In [ ]:
 

Adaptive Boosting Classification Model¶

In [76]:
# Build Adaptive Boosting Pipeline
tfidfVectorizer_ab, classifier_ab, model_ab, y_pred_ab = buildPipeline(AdaBoostClassifier(learning_rate=0.5, n_estimators=500), None)
# Build Confusion Matrix Heat Map Figure
confusionMatrix_ab, fig_cm_ab, classification_ab, accuracy_ab = buildConfusionMatrix(y_test, y_pred_ab)
# Append Metrics
metrics = appendMetrics(classificationName_ab, classification_ab, metrics)
              precision    recall  f1-score   support

        Fake       0.99      0.99      0.99      4686
        Real       0.99      0.99      0.99      4294

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Accuracy: 0.9913140311804008
In [ ]:
 

Cat Boost Classification Model¶

In [77]:
# Build Cat Boosting Pipeline
tfidfVectorizer_cb, classifier_cb, model_cb, y_pred_cb = buildPipeline(CatBoostClassifier(learning_rate=0.1, iterations=1000, depth=6, verbose=False), None)
# Build Confusion Matrix Heat Map Figure
confusionMatrix_cb, fig_cm_cb, classification_cb, accuracy_cb = buildConfusionMatrix(y_test, y_pred_cb)
# Append Metrics
metrics = appendMetrics(classificationName_cb, classification_cb, metrics)
              precision    recall  f1-score   support

        Fake       0.99      0.99      0.99      4686
        Real       0.99      0.99      0.99      4294

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Accuracy: 0.9902004454342984
In [ ]:
 

Confusion Matrices¶

¶

In [78]:
def addConfusionMatrixFigure(fig, figure, row, col):
    
    # Add Trace
    fig.add_trace(figure, row, col)
    # Add Annotations
    annotations = figure["text"]
    # Search through Rows
    for i in range(len(annotations)):
        
        # Search through Rows
        for j in range(len(annotations[i])):
            
            # Add Annotation
            fig.add_annotation(
                x=j,
                y=i,
                text=str(annotations[i][j]),
                showarrow=False,
                font=dict(color="white", size=12),
                xref="x",
                yref="y",
                row=row, 
                col=col
            )
In [79]:
# Set up Sub Plots
fig = make_subplots(rows=4, 
                    cols=2, 
                    subplot_titles=(classificationName_nb, 
                                    classificationName_lr, 
                                    classificationName_svm, 
                                    classificationName_rf, 
                                    classificationName_xg, 
                                    classificationName_lg, 
                                    classificationName_ab, 
                                    classificationName_cb))
# Add 
addConfusionMatrixFigure(fig, fig_cm_nb.data[0], 1, 1)
addConfusionMatrixFigure(fig, fig_cm_lr.data[0], row=1, col=2)
addConfusionMatrixFigure(fig, fig_cm_svm.data[0], row=2, col=1)
addConfusionMatrixFigure(fig, fig_cm_rf.data[0], row=2, col=2)
addConfusionMatrixFigure(fig, fig_cm_xg.data[0], row=3, col=1)
addConfusionMatrixFigure(fig, fig_cm_lg.data[0], row=3, col=2)
addConfusionMatrixFigure(fig, fig_cm_ab.data[0], row=4, col=1)
addConfusionMatrixFigure(fig, fig_cm_cb.data[0], row=4, col=2)
# Update Title
fig.update_layout(title="Classification Models Confusion Matrices", 
                  title_x=0.5,
                  height=1000)
# Show Chart
fig.show()
In [ ]:
 

Summarised Performance Metrics¶

¶

In [80]:
# Update Data Frame
metrics["Accuracy"] = metrics["Accuracy"].astype(float)
metrics["Precision"] = metrics["Precision"].astype(float)
metrics["Recall"] = metrics["Recall"].astype(float)
metrics["F1-Score"] = metrics["F1-Score"].astype(float)
In [81]:
# Sort Results
metrics = metrics.copy().sort_values(by=["Accuracy"], ascending=False)
# Assign Value
metrics["Accuracy"] = metrics["Accuracy"]
# Copy DataFrame
metrics_cvs = metrics.copy()
# Output Results
metrics
Out[81]:
Name Accuracy Precision Recall F1-Score
1 Logistic Regression 99.4878 99.4887 99.4848 99.4868
2 Linear SVC 99.4432 99.4376 99.4470 99.4422
5 LightGBM 99.2650 99.2618 99.2656 99.2637
6 Adaptive Boosting 99.1314 99.1199 99.1414 99.1299
4 XG Boost 99.0980 99.0958 99.0968 99.0963
7 CatBoost 99.0200 99.0155 99.0211 99.0182
3 Random Forest 97.9176 97.9850 97.8635 97.9112
0 Naive Bayes 96.2806 96.3296 96.2297 96.2699
In [ ]:
 

Visualisations / Graphing¶

¶

In [82]:
# Assign Chart Colours
chart_colours = [charcoalColour, tealColour, lightGreenColour, purpleColour]
chart_colours_dict = { "Accuracy": charcoalColour, "Precision": tealColour, "Recall": lightGreenColour, "F1-Score": purpleColour}
In [ ]:
 

Overall Performance Metrics (Line Chart per Metric)¶

In [83]:
def addMetricFigure(fig, metric, row, col):
    
    # Plot Bar Chart
    fig_metric = go.Figure()
    # Assign Metrics
    fig_metric.add_trace(go.Scatter(x=metrics["Name"], y=metrics[metric], mode="lines+markers", name=metric, marker=dict(color=chart_colours_dict[metric], size=8)))
    # Assign Text
    fig_metric.add_trace(go.Scatter(x=metrics["Name"], y=metrics[metric], mode="text", name=metric, text=metrics[metric].round(2), textposition="top right"))
    # Add Sub Plots
    fig.add_trace(fig_metric.data[0], row=row, col=col)
    
In [84]:
# Set up Sub Plots
fig = make_subplots(rows=2, 
                    cols=2, 
                    subplot_titles=("Accuracy", "Precision", "Recall", "F1-Score"))
# Add 
addMetricFigure(fig, "Accuracy", 1, 1)
addMetricFigure(fig, "Precision", 1, 2)
addMetricFigure(fig, "Recall", 2, 1)
addMetricFigure(fig, "F1-Score", 2, 2)
# Update Title
fig.update_layout(title="Classification Models Subplots", 
                  title_x=0.5,
                  height=800)
# Show Chart
fig.show()

Overall Performance Metrics (Bar Chart Combined)¶

In [85]:
# Plot Bar Chart
fig = px.bar(metrics, 
             x="Name", 
             y=["Accuracy", "Precision", "Recall", "F1-Score"], 
             barmode="group",
             color_discrete_map=chart_colours_dict)
# Apply Titles and Axis Titles
fig.update_layout(title="Interactive Performance Metrics",
                  title_x=0.5,
                  height=700,
                  xaxis_title="Classification Model",
                  yaxis_title="Percentage")
# Limit Range
fig.update_yaxes(range=[95, 100])
# Show Chart
fig.show()
In [ ]:
 

Cross Validation Scoring¶

In [86]:
# Assign Metric Scoring
scores = {
    "Accuracy": "accuracy",
    "Precision": "precision",
    "Recall": "recall",
    "F1-Score": "f1"
}
In [87]:
def crossValidateMetric(model, name, X, y, cv=5):
        
    # Print Status
    #print(f"Started {name} - " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    
    # Search through Items
    for metric, scoring in scores.items():
        
        # Run Cross Validation Score
        results = cross_val_score(model, X, y, cv=cv, scoring=scoring, n_jobs=3)
        # Assign Value
        row = metrics_cvs[metrics_cvs["Name"] == name]
        # Assign Values
        metrics_cvs.loc[row.index, metric + " CVS"] = np.mean(results) * 100
        
    # Print Status
    #print(f"Completed {name} - " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
 
In [ ]:
 
In [88]:
# Assign Value
cv = 3
# Cross Validation Classification Models
crossValidateMetric(model_nb, classificationName_nb, X, y, cv)
crossValidateMetric(model_lr, classificationName_lr, X, y, cv)
crossValidateMetric(model_svm, classificationName_svm, X, y, cv)
crossValidateMetric(model_rf, classificationName_rf, X, y, cv)
crossValidateMetric(model_xg, classificationName_xg, X, y, cv)
crossValidateMetric(model_lg, classificationName_lg, X, y, cv)
crossValidateMetric(model_ab, classificationName_ab, X, y, cv)
crossValidateMetric(model_cb, classificationName_cb, X, y, cv)
In [89]:
# Print Out Metrics
metrics_cvs
Out[89]:
Name Accuracy Precision Recall F1-Score Accuracy CVS Precision CVS Recall CVS F1-Score CVS
1 Logistic Regression 99.4878 99.4887 99.4848 99.4868 98.521092 98.946011 97.954896 98.440256
2 Linear SVC 99.4432 99.4376 99.4470 99.4422 98.581229 98.848117 98.183686 98.507747
5 LightGBM 99.2650 99.2618 99.2656 99.2637 98.690365 98.959680 98.295746 98.622655
6 Adaptive Boosting 99.1314 99.1199 99.1414 99.1299 98.532229 98.621454 98.309754 98.459829
4 XG Boost 99.0980 99.0958 99.0968 99.0963 98.329547 98.733805 97.758790 98.239539
7 CatBoost 99.0200 99.0155 99.0211 99.0182 98.187002 97.919425 98.319092 98.108835
3 Random Forest 97.9176 97.9850 97.8635 97.9112 96.594503 98.534256 94.420320 96.475840
0 Naive Bayes 96.2806 96.3296 96.2297 96.2699 91.850416 95.863269 86.935612 90.934652
In [ ]:
 

Real-time News Analysis¶

¶

In [90]:
def analyseNewsArticle(title, text):

    # Assign Sentence
    sentence = title + " - " + text

    # Apply Preprocessing

    # Lowercase
    sentence = sentence.lower()
    # Remove URLs
    sentence = removeUrls(sentence)
    # Remove Punctuation
    sentence = sentence.translate(removepunctuation)
    # Remove Numbers
    sentence = re.sub(r"\d+", "", sentence)
    # Remove Unicode
    sentence = removeUnicode(sentence)
    # Lemmatize Words
    sentence = lemmatizeText(sentence)
    
    # Run through Pipelines

    # Transform Sentence
    sentence_vectorized = tfidfVectorizer_nb.transform([sentence])
    
    # Make Predictions 
    prediction = classifier_nb.predict(sentence_vectorized)[0]
    prediction = prediction + classifier_lr.predict(sentence_vectorized)[0]
    prediction = prediction + classifier_svm.predict(sentence_vectorized)[0]
    prediction = prediction + classifier_rf.predict(sentence_vectorized)[0]
    prediction = prediction + classifier_xg.predict(sentence_vectorized)[0]
    prediction = prediction + classifier_lg.predict(sentence_vectorized)[0]
    prediction = prediction + classifier_ab.predict(sentence_vectorized)[0]
    prediction = prediction + classifier_cb.predict(sentence_vectorized)[0]

    # Check Value
    if prediction >= 5:
        return "Real"
    else:
        return "Fake"   
In [ ]:
 

News API Analysis¶

¶

In [91]:
# Initialize NewsAPI
newsapi = NewsApiClient(api_key=newsAPIKey)
In [92]:
# Get Top Headlines - BBC News / CNN / NBC News / Russian TV
top_headlines = newsapi.get_top_headlines(q="",
                                          sources="bbc-news,cnn,nbc-news,rt",
                                          language="en", 
                                          page_size=20)
In [93]:
# Check Value
if top_headlines["status"] == "ok":
    
    # Print Spacer
    display(Markdown(f"#\n# Fake / Real News Analaysis:\n#"))
    
    # Go through each article
    for article in top_headlines["articles"]:
        
        # Assign Values
        title = article["title"]
        text = article["content"]
        url = article["url"]
        publishedDate = article["publishedAt"]
        publishedDate = publishedDate.replace("T", " ")
        publishedDate = publishedDate[:10]
        titleLink = f'<a href="{url}" target="_blank">{title}</a>'
        
        # Check Text
        if text is not None:
        
            # Analyse Article
            status = analyseNewsArticle(title, text)
            # Check article if Real / Fake
            display(Markdown(f"**Title:** {titleLink}\n**Published Date:** {publishedDate}\n**Status:** {status}"))
            
    # Print Spacer
    display(Markdown("#"))

¶

Fake / Real News Analaysis:¶

¶

Title: Italian man crushed to death under falling cheese wheels Published Date: 2023-08-07 Status: Real

Title: Tou Thao: Ex-officer in George Floyd case gets 57 months for role in killing Published Date: 2023-08-07 Status: Real

Title: Brick Lane: Chinese political slogans appear on London street art wall Published Date: 2023-08-07 Status: Real

Title: Hank the Tank: Fugitive burglar bear captured in California Published Date: 2023-08-07 Status: Real

Title: A Moscow summer with war on people's minds Published Date: 2023-08-07 Status: Real

Title: Ex-FBI counterintelligence chief in talks to plead guilty over work he did for Russian oligarch Published Date: 2023-08-07 Status: Real

Title: Matty Healy: The 1975 threatened with legal action after Malaysia festival cancellation Published Date: 2023-08-07 Status: Real

Title: Ahead of Ohio abortion vote, Republicans try to change the rules Published Date: 2023-08-07 Status: Real

Title: Only 1 in 5 adults with an opioid use disorder received medication to treat it in 2021 Published Date: 2023-08-07 Status: Real

Title: 162 infant deaths have been associated with nursing pillows since 2007, investigation finds Published Date: 2023-08-07 Status: Real

Title: 1 hurt in a possible explosion at a Sherwin-Williams paint factory plant in Texas Published Date: 2023-08-07 Status: Real

Title: Zelenskyy assassination plot: Ukraine detains Russian informant suspect Published Date: 2023-08-07 Status: Real

Title: The new liberal majority on the Wisconsin Supreme Court is off to a tense start Published Date: 2023-08-07 Status: Real

Title: Angus Cloud: Euphoria star's mother says his death was 'not intentional' Published Date: 2023-08-07 Status: Real

Title: More gay men can give blood as ‘one of the most significant changes in blood banking history’ gets underway Published Date: 2023-08-07 Status: Real

Title: Top librarian calls 'Marxist lesbian' tweet backlash 'regrettable' Published Date: 2023-08-07 Status: Real

Title: 'Of course he lost': Ron DeSantis rejects Trump's 2020 election claims Published Date: 2023-08-07 Status: Real

Title: Daimler Truck finance chief Jochen Goetz dies in ‘tragic incident’ Published Date: 2023-08-07 Status: Real

Title: WATCH: Moment Alaska house collapses into river Published Date: 2023-08-07 Status: Real

¶